getwd()
## [1] "/Users/yuezhang/Documents/Biostat/Biostatistics/PH1820/Code"
library(tidyverse)
library(lubridate)
library(dplyr)
library(ggthemes)
library(ggplot2)
library(readxl)
library(lmtest)
library(mfx)
library(pROC)
library(haven)
library(car)
library(PMCMRplus)
library(VGAM)
library(describedata)
library(olsrr)
library(ggpubr)
library(GGally)
library(knitr)
library(car)
library(MASS)
library(faraway)
library(corrplot)
library(ggcorrplot)
library(goftest)
#Research Question: #Predictive modeling for life expectancy - is there a significant difference in life expectancy between developed and developing countries?
#Load Data
life = read.csv("/Users/yuezhang/Documents/Biostat/Biostatistics/PH1820/Data/Raw/Life Expectancy Data.csv")
#EDA
#Check Missing Data
life %>% summarize(across(everything(), ~sum(is.na(.))))
## Country Year Status Life.expectancy Adult.Mortality infant.deaths Alcohol
## 1 0 0 0 10 10 0 194
## percentage.expenditure Hepatitis.B Measles BMI under.five.deaths Polio
## 1 0 553 0 34 0 19
## Total.expenditure Diphtheria HIV.AIDS GDP Population thinness..1.19.years
## 1 226 19 0 448 652 34
## thinness.5.9.years Income.composition.of.resources Schooling
## 1 34 167 163
str(life)
## 'data.frame': 2938 obs. of 22 variables:
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : int 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
## $ Status : chr "Developing" "Developing" "Developing" "Developing" ...
## $ Life.expectancy : num 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult.Mortality : int 263 271 268 272 275 279 281 287 295 295 ...
## $ infant.deaths : int 62 64 66 69 71 74 77 80 82 84 ...
## $ Alcohol : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage.expenditure : num 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis.B : int 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : int 1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
## $ BMI : num 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under.five.deaths : int 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : int 6 58 62 67 68 66 63 64 63 58 ...
## $ Total.expenditure : num 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : int 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV.AIDS : num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num 584.3 612.7 631.7 670 63.5 ...
## $ Population : num 33736494 327582 31731688 3696958 2978599 ...
## $ thinness..1.19.years : num 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
## $ thinness.5.9.years : num 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income.composition.of.resources: num 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
life = life[, c(
"Life.expectancy",
"Status",
"Adult.Mortality",
"infant.deaths",
"Alcohol",
"BMI",
"thinness..1.19.years",
"thinness.5.9.years",
"Schooling",
"GDP",
"Income.composition.of.resources"
)]
life = life %>%
drop_na() %>%
rename(
"Life_expectancy" = "Life.expectancy",
"Adult_Mortality" = "Adult.Mortality",
"Infant_deaths" = "infant.deaths",
"Thinness_10_19_years" = "thinness..1.19.years",
"Thinness_5_9_years" = "thinness.5.9.years",
"Income" = "Income.composition.of.resources"
)
#Convert the adult mortality, under five deaths, and infant deaths into % as in Kaggle they're deaths per 1000
life = life %>% mutate(
Adult_Mortality = Adult_Mortality / 10,
Infant_deaths = Infant_deaths / 10
)
life$Status = as.factor(life$Status)
#Descriptive Statistics for variables
life %>%
summarise(across(where(is.numeric), list(
mean = ~mean(., na.rm = TRUE),
sd = ~sd(., na.rm = TRUE),
min = ~min(., na.rm = TRUE),
max = ~max(., na.rm = TRUE),
missing = ~sum(is.na(.))
))) %>%
pivot_longer(cols = everything(),
names_to = c("Variable", "Statistic"),
names_pattern = "(.*)_(.*)") %>%
pivot_wider(names_from = Statistic, values_from = value)
## # A tibble: 10 × 6
## Variable mean sd min max missing
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Life_expectancy 69.3 9.70 36.3 89 0
## 2 Adult_Mortality 16.2 12.8 0.1 72.3 0
## 3 Infant_deaths 3.16 12.9 0 180 0
## 4 Alcohol 4.61 4.03 0.01 17.9 0
## 5 BMI 38.1 19.8 1.4 77.1 0
## 6 Thinness_10_19_years 4.86 4.53 0.1 27.7 0
## 7 Thinness_5_9_years 4.90 4.62 0.1 28.6 0
## 8 Schooling 12.1 3.35 0 20.7 0
## 9 GDP 7597. 14518. 1.68 119173. 0
## 10 Income 0.630 0.215 0 0.948 0
#Histograms
life %>% select_if(is.numeric) %>%
pivot_longer(cols = everything()) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 20, fill = "skyblue", color = "black") +
facet_wrap(~ name, scales = "free") +
theme_minimal() +
ggtitle("Histograms of Numeric Variables")
life %>% select_if(is.factor) %>%
pivot_longer(cols = everything()) %>%
ggplot(aes(x = value)) +
geom_bar(fill = "gold", color = "black") +
facet_wrap(~ name, scales = "free") +
theme_minimal() +
ggtitle("Histograms of Categorical Variables")
#Boxplots
life %>%
dplyr::select(Life_expectancy, Adult_Mortality, Infant_deaths) %>%
pivot_longer(cols = everything()) %>%
ggplot(aes(x = "", y = value)) +
geom_boxplot(fill = "lightgreen", alpha = 0.6) +
facet_wrap(~ name, scales = "free", nrow = 1) +
theme_minimal() +
ggtitle("Boxplots of Numeric Variables") +
theme(
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 12),
plot.title = element_text(size = 14, face = "bold")
)
life %>%
dplyr::select(Alcohol, BMI, GDP, Income) %>%
pivot_longer(cols = everything()) %>%
ggplot(aes(x = "", y = value)) +
geom_boxplot(fill = "lightgreen", alpha = 0.6) +
facet_wrap(~ name, scales = "free", nrow = 1) +
theme_minimal() +
ggtitle("Boxplots of Numeric Variables") +
theme(
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 12),
plot.title = element_text(size = 14, face = "bold")
)
life %>%
dplyr::select(Thinness_5_9_years, Thinness_10_19_years, Schooling) %>%
pivot_longer(cols = everything()) %>%
ggplot(aes(x = "", y = value)) +
geom_boxplot(fill = "lightgreen", alpha = 0.6) +
facet_wrap(~ name, scales = "free", nrow = 1) +
theme_minimal() +
ggtitle("Boxplots of Numeric Variables") +
theme(
axis.text.x = element_blank(),
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
strip.text = element_text(size = 12),
plot.title = element_text(size = 14, face = "bold")
)
ggplot(life, aes(x = Status, y = Life_expectancy, fill = Status)) +
geom_boxplot(alpha = 0.6) +
theme_minimal() +
ggtitle("Boxplots of Life Expectancy by Status")
#Scatterplot
life_numeric = life %>% select_if(is.numeric)
pairs(life_numeric, main = "Scatterplot Matrix of Numeric Variables")
#Correlation
cor_matrix = cor(life_numeric, method = "spearman", use = "pairwise.complete.obs")
print(cor_matrix)
## Life_expectancy Adult_Mortality Infant_deaths Alcohol
## Life_expectancy 1.0000000 -0.6238504 -0.6145628 0.4301817
## Adult_Mortality -0.6238504 1.0000000 0.3789025 -0.2053702
## Infant_deaths -0.6145628 0.3789025 1.0000000 -0.3594502
## Alcohol 0.4301817 -0.2053702 -0.3594502 1.0000000
## BMI 0.6026058 -0.3901869 -0.4876205 0.3428487
## Thinness_10_19_years -0.6257777 0.3823203 0.4516519 -0.4649118
## Thinness_5_9_years -0.6356402 0.3990258 0.4692331 -0.4558256
## Schooling 0.8158743 -0.4788217 -0.6130518 0.5665574
## GDP 0.6473679 -0.3839143 -0.5110845 0.4271796
## Income 0.8694348 -0.5358526 -0.5837963 0.5230530
## BMI Thinness_10_19_years Thinness_5_9_years
## Life_expectancy 0.6026058 -0.6257777 -0.6356402
## Adult_Mortality -0.3901869 0.3823203 0.3990258
## Infant_deaths -0.4876205 0.4516519 0.4692331
## Alcohol 0.3428487 -0.4649118 -0.4558256
## BMI 1.0000000 -0.5606424 -0.5717655
## Thinness_10_19_years -0.5606424 1.0000000 0.9398958
## Thinness_5_9_years -0.5717655 0.9398958 1.0000000
## Schooling 0.6354777 -0.5930163 -0.5936351
## GDP 0.4856409 -0.4257313 -0.4350441
## Income 0.6281214 -0.5942579 -0.5932238
## Schooling GDP Income
## Life_expectancy 0.8158743 0.6473679 0.8694348
## Adult_Mortality -0.4788217 -0.3839143 -0.5358526
## Infant_deaths -0.6130518 -0.5110845 -0.5837963
## Alcohol 0.5665574 0.4271796 0.5230530
## BMI 0.6354777 0.4856409 0.6281214
## Thinness_10_19_years -0.5930163 -0.4257313 -0.5942579
## Thinness_5_9_years -0.5936351 -0.4350441 -0.5932238
## Schooling 1.0000000 0.6723717 0.9018080
## GDP 0.6723717 1.0000000 0.6979678
## Income 0.9018080 0.6979678 1.0000000
ggcorrplot(cor_matrix, method = "circle",
lab = TRUE, lab_size = 3, colors = c("tomato", "white", "dodgerblue3"),
title = "Spearman Correlation Matrix",
ggtheme = theme_minimal()) +
theme(text = element_text(size = 10))
#Fit preliminary model
model_pre = lm(data = life, life$Life_expectancy ~ .)
summary(model_pre)
##
## Call:
## lm(formula = life$Life_expectancy ~ ., data = life)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.7096 -2.2343 0.2018 2.7638 23.9309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.782e+01 7.323e-01 78.954 < 2e-16 ***
## StatusDeveloping -1.708e+00 3.558e-01 -4.799 1.69e-06 ***
## Adult_Mortality -2.805e-01 9.158e-03 -30.629 < 2e-16 ***
## Infant_deaths -1.680e-03 9.003e-03 -0.187 0.8520
## Alcohol -1.764e-01 3.406e-02 -5.179 2.43e-07 ***
## BMI 5.200e-02 6.739e-03 7.717 1.77e-14 ***
## Thinness_10_19_years -1.244e-01 6.119e-02 -2.033 0.0421 *
## Thinness_5_9_years -1.450e-02 6.028e-02 -0.240 0.8100
## Schooling 9.249e-01 5.578e-02 16.580 < 2e-16 ***
## GDP 4.203e-05 8.238e-06 5.103 3.62e-07 ***
## Income 8.655e+00 7.993e-01 10.828 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.781 on 2300 degrees of freedom
## Multiple R-squared: 0.7583, Adjusted R-squared: 0.7573
## F-statistic: 721.7 on 10 and 2300 DF, p-value: < 2.2e-16
#Preliminary Diagnostics
#Linearity
#Residuals vs. x
fitted_vals = fitted(model_pre)
residuals = resid(model_pre)
predictors = c("Adult_Mortality", "Alcohol", "BMI", "Infant_deaths",
"Schooling", "Thinness_10_19_years", "Thinness_5_9_years", "GDP", "Income")
for (var in predictors) {
p = ggplot(life, aes(x = .data[[var]], y = residuals)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
ggtitle(paste("Residuals vs", var)) +
theme_minimal() +
ylab("Residuals") +
xlab(var)
print(p)
}
# Residuals vs Fitted Plot
ggplot(data = NULL, aes(x = fitted_vals, y = residuals)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Fitted Values", y = "Residuals", title = "Residuals vs. Fitted Values") +
theme_minimal()
#Linear
#Homoscedasticity
bptest(model_pre)
##
## studentized Breusch-Pagan test
##
## data: model_pre
## BP = 585.98, df = 10, p-value < 2.2e-16
#Non constant variance. Also the residuals vs. fitted has a funnel shape
#Normality, since this is a large sample test, the Shapiro Wilk test will be sensitive, we will use Anderson-Darling test instead
goftest::ad.test(residuals)
##
## Anderson-Darling test of goodness-of-fit
## Null hypothesis: uniform distribution
## Parameters assumed to be fixed
##
## data: residuals
## An = Inf, p-value = 2.596e-07
# Q-Q Plot of residuals
ggplot(data = NULL, aes(sample = residuals)) +
stat_qq() +
stat_qq_line(color = "red") +
labs(title = "Q-Q Plot of Residuals") +
theme_minimal()
#Not normally distributed
#VIF
vif(model_pre)
## StatusDeveloping Adult_Mortality Infant_deaths
## 1.914029 1.382652 1.369239
## Alcohol BMI Thinness_10_19_years
## 1.902441 1.801401 7.757667
## Thinness_5_9_years Schooling GDP
## 7.840726 3.528343 1.445231
## Income
## 2.981523
#Some Multicollinearity, as the thinness_5_9 and thinness_10_19 vif are larger than 5 and thinness_5_9 is a non-significant variable
#Potential Outliers
resid_stud = rstudent(model_pre)
ggplot(data = NULL, aes(x = fitted_vals, y = resid_stud)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = c(-2, 2), color = "red", linetype = "dashed") +
geom_hline(yintercept = 0, color = "black") +
labs(
title = "Studentized Residuals vs Fitted Values",
x = "Fitted Values",
y = "Studentized Residuals"
) +
theme_minimal()
potential_outliers = which(abs(resid_stud) > 2)
print(potential_outliers)
## 61 62 72 73 74 75 76 77 170 262 272 286 287 290 364 390
## 61 62 72 73 74 75 76 77 170 262 272 286 287 290 364 390
## 412 417 420 421 446 449 450 451 452 462 463 464 465 466 467 608
## 412 417 420 421 446 449 450 451 452 462 463 464 465 466 467 608
## 663 668 681 710 756 757 770 776 778 852 853 854 892 893 899 919
## 663 668 681 710 756 757 770 776 778 852 853 854 892 893 899 919
## 1136 1137 1138 1139 1140 1141 1142 1199 1219 1222 1225 1285 1292 1295 1326 1340
## 1136 1137 1138 1139 1140 1141 1142 1199 1219 1222 1225 1285 1292 1295 1326 1340
## 1342 1428 1429 1430 1431 1448 1449 1450 1451 1452 1453 1460 1461 1491 1554 1577
## 1342 1428 1429 1430 1431 1448 1449 1450 1451 1452 1453 1460 1461 1491 1554 1577
## 1578 1579 1580 1581 1582 1673 1719 1788 1790 1882 1884 1890 1892 1894 1896 1897
## 1578 1579 1580 1581 1582 1673 1719 1788 1790 1882 1884 1890 1892 1894 1896 1897
## 1907 1946 1966 2009 2011 2013 2014 2015 2101 2105 2191 2198 2199 2200 2203 2288
## 1907 1946 1966 2009 2011 2013 2014 2015 2101 2105 2191 2198 2199 2200 2203 2288
## 2293 2294 2304 2305 2309
## 2293 2294 2304 2305 2309
life = life[-potential_outliers,]
#Remove thinness_5_9 due to high vif and non significant p value
life$Thinness_5_9_years = NULL
model_2 = lm(data = life, life$Life_expectancy ~ .)
summary(model_2)
##
## Call:
## lm(formula = life$Life_expectancy ~ ., data = life)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.0356 -2.0510 0.0906 2.2237 9.4981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.052e+01 5.867e-01 103.160 < 2e-16 ***
## StatusDeveloping -1.104e+00 2.575e-01 -4.287 1.89e-05 ***
## Adult_Mortality -3.517e-01 7.147e-03 -49.202 < 2e-16 ***
## Infant_deaths 6.538e-03 6.524e-03 1.002 0.316
## Alcohol -1.281e-01 2.566e-02 -4.993 6.41e-07 ***
## BMI 2.185e-02 4.951e-03 4.414 1.06e-05 ***
## Thinness_10_19_years -1.498e-01 2.288e-02 -6.545 7.37e-11 ***
## Schooling 9.438e-01 4.335e-02 21.770 < 2e-16 ***
## GDP 2.913e-05 5.930e-06 4.913 9.62e-07 ***
## Income 7.331e+00 6.132e-01 11.955 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.414 on 2184 degrees of freedom
## Multiple R-squared: 0.862, Adjusted R-squared: 0.8614
## F-statistic: 1516 on 9 and 2184 DF, p-value: < 2.2e-16
#Residuals vs. x
fitted_vals2 = fitted(model_2)
residuals2 = resid(model_2)
predictors2 = c("Adult_Mortality", "Alcohol", "BMI", "Infant_deaths",
"Schooling", "Thinness_10_19_years", "GDP", "Income")
for (var in predictors2) {
p = ggplot(life, aes(x = .data[[var]], y = residuals2)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
ggtitle(paste("Residuals vs", var)) +
theme_minimal() +
ylab("Residuals") +
xlab(var)
print(p)
}
# Residuals vs Fitted Plot
ggplot(data = NULL, aes(x = fitted_vals2, y = residuals2)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Fitted Values", y = "Residuals", title = "Residuals vs. Fitted Values") +
theme_minimal()
#Linear
#Homoscedasticity
bptest(model_2)
##
## studentized Breusch-Pagan test
##
## data: model_2
## BP = 337.65, df = 9, p-value < 2.2e-16
#Non constant variance. Also the residuals vs. fitted has a funnel shape
#Normality, since this is a large sample test, the Shapiro Wilk test will be sensitive, we will use Anderson-Darling test instead
goftest::ad.test(residuals2)
##
## Anderson-Darling test of goodness-of-fit
## Null hypothesis: uniform distribution
## Parameters assumed to be fixed
##
## data: residuals2
## An = Inf, p-value = 2.735e-07
# Q-Q Plot of residuals
ggplot(data = NULL, aes(sample = residuals2)) +
stat_qq() +
stat_qq_line(color = "red") +
labs(title = "Q-Q Plot of Residuals") +
theme_minimal()
#Not normally distributed
#Transformation
#Boxcox
b = boxcox(model_2)
lambda = b$x[which.max(b$y)]
lambda
## [1] 2
life$Life_expectancy_bc = (life$Life_expectancy^2-1)/2
model_bc = lm(data = life, Life_expectancy_bc ~ .-Life_expectancy )
summary(model_bc)
##
## Call:
## lm(formula = Life_expectancy_bc ~ . - Life_expectancy, data = life)
##
## Residuals:
## Min 1Q Median 3Q Max
## -728.90 -141.55 -7.96 147.44 751.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.846e+03 3.963e+01 46.585 < 2e-16 ***
## StatusDeveloping -1.157e+02 1.739e+01 -6.651 3.67e-11 ***
## Adult_Mortality -2.166e+01 4.828e-01 -44.868 < 2e-16 ***
## Infant_deaths 4.661e-01 4.407e-01 1.058 0.29
## Alcohol -8.742e+00 1.733e+00 -5.044 4.94e-07 ***
## BMI 1.358e+00 3.344e-01 4.060 5.08e-05 ***
## Thinness_10_19_years -1.093e+01 1.546e+00 -7.072 2.05e-12 ***
## Schooling 6.387e+01 2.929e+00 21.810 < 2e-16 ***
## GDP 2.937e-03 4.006e-04 7.332 3.18e-13 ***
## Income 5.004e+02 4.142e+01 12.079 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 230.6 on 2184 degrees of freedom
## Multiple R-squared: 0.8599, Adjusted R-squared: 0.8593
## F-statistic: 1489 on 9 and 2184 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(model_bc, cex.axis = 1.2, cex.lab = 1.5)
#Diagnostics
#Linearity
fitted_vals_bc = fitted(model_bc)
residuals_bc = resid(model_bc)
for (var in predictors2) {
p = ggplot(life, aes(x = .data[[var]], y = residuals_bc)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red")+
ggtitle(paste("Residuals vs", var)) +
theme_minimal() +
ylab("Residuals") +
xlab(var)
print(p)
}
# Residuals vs Fitted Plot
ggplot(data = NULL, aes(x = fitted_vals_bc, y = residuals_bc)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Fitted Values", y = "Residuals", title = "Residuals vs. Fitted Values") +
theme_minimal()
#Linear
#Homoscedasticity
bptest(model_bc)
##
## studentized Breusch-Pagan test
##
## data: model_bc
## BP = 253.05, df = 9, p-value < 2.2e-16
#Non constant variance. Also the residuals vs. fitted has a funnel shape
#Normality
goftest::ad.test(residuals_bc)
##
## Anderson-Darling test of goodness-of-fit
## Null hypothesis: uniform distribution
## Parameters assumed to be fixed
##
## data: residuals_bc
## An = Inf, p-value = 2.735e-07
# Q-Q Plot of residuals
ggplot(data = NULL, aes(sample = residuals_bc)) +
stat_qq() +
stat_qq_line(color = "red") +
labs(title = "Q-Q Plot of Residuals") +
theme_minimal()
#Not normally distributed
#VIF
vif(model_bc)
## StatusDeveloping Adult_Mortality Infant_deaths
## 1.931741 1.566780 1.342180
## Alcohol BMI Thinness_10_19_years
## 2.030717 1.788878 2.022034
## Schooling GDP Income
## 3.597825 1.446324 2.952699
#No Multicollinearity
#Transformation again
life$Adult_Mortality_c = scale(life$Adult_Mortality, center = TRUE, scale = FALSE)
life$Adult_Mortality_sq = life$Adult_Mortality_c^2
model_3 = lm(
data = life,
Life_expectancy_bc ~
Status +
Adult_Mortality_c + Adult_Mortality_sq +
BMI +
Schooling +
Infant_deaths +
Alcohol +
GDP +
Income +
Thinness_10_19_years
)
summary(model_3)
##
## Call:
## lm(formula = Life_expectancy_bc ~ Status + Adult_Mortality_c +
## Adult_Mortality_sq + BMI + Schooling + Infant_deaths + Alcohol +
## GDP + Income + Thinness_10_19_years, data = life)
##
## Residuals:
## Min 1Q Median 3Q Max
## -652.7 -146.2 -10.2 136.5 754.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.501e+03 3.549e+01 42.278 < 2e-16 ***
## StatusDeveloping -1.303e+02 1.678e+01 -7.770 1.20e-14 ***
## Adult_Mortality_c -1.700e+01 5.826e-01 -29.187 < 2e-16 ***
## Adult_Mortality_sq -2.552e-01 1.925e-02 -13.252 < 2e-16 ***
## BMI 1.443e+00 3.219e-01 4.482 7.76e-06 ***
## Schooling 6.683e+01 2.827e+00 23.641 < 2e-16 ***
## Infant_deaths 3.448e-01 4.241e-01 0.813 0.416
## Alcohol -9.330e+00 1.668e+00 -5.593 2.52e-08 ***
## GDP 3.357e-03 3.867e-04 8.681 < 2e-16 ***
## Income 4.956e+02 3.986e+01 12.432 < 2e-16 ***
## Thinness_10_19_years -1.011e+01 1.489e+00 -6.792 1.42e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 221.9 on 2183 degrees of freedom
## Multiple R-squared: 0.8703, Adjusted R-squared: 0.8697
## F-statistic: 1465 on 10 and 2183 DF, p-value: < 2.2e-16
#Diagnostics
#Linearity
fitted_vals3 = fitted(model_3)
residuals3 = resid(model_3)
predictors3 = c("Adult_Mortality_c", "Alcohol", "BMI", "Infant_deaths",
"Schooling", "Thinness_10_19_years", "GDP", "Income")
for (var in predictors3) {
p = ggplot(life, aes(x = .data[[var]], y = residuals3)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red")+
ggtitle(paste("Residuals vs", var)) +
theme_minimal() +
ylab("Residuals") +
xlab(var)
print(p)
}
ggplot(data = NULL, aes(x = life$Adult_Mortality_sq, y = residuals3)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
theme_minimal() +
ggtitle("Residuals vs. Adult_Mortality_sq") +
xlab("Adult_Mortality_sq") +
ylab("Residuals")
# Residuals vs Fitted Plot
ggplot(data = NULL, aes(x = fitted_vals3, y = residuals3)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Fitted Values", y = "Residuals", title = "Residuals vs. Fitted Values") +
theme_minimal()
#Linear
#Homoscedasticity
bptest(model_3)
##
## studentized Breusch-Pagan test
##
## data: model_3
## BP = 186.46, df = 10, p-value < 2.2e-16
#Non constant variance. Also the residuals vs. fitted has a funnel shape
#Normality
goftest::ad.test(residuals3)
##
## Anderson-Darling test of goodness-of-fit
## Null hypothesis: uniform distribution
## Parameters assumed to be fixed
##
## data: residuals3
## An = Inf, p-value = 2.735e-07
# Q-Q Plot of residuals
ggplot(data = NULL, aes(sample = residuals3)) +
stat_qq() +
stat_qq_line(color = "red") +
labs(title = "Q-Q Plot of Residuals") +
theme_minimal()
#Not normally distributed
#Weighted Leasted Squares adjusting for non-constant variance
weights = 1/fitted(lm(abs(residuals3) ~ fitted_vals3))^2
model_wls = lm(
data = life,
Life_expectancy_bc ~
Status +
Adult_Mortality_c + Adult_Mortality_sq +
BMI +
Schooling +
Infant_deaths +
Alcohol +
GDP +
Income +
Thinness_10_19_years,
weights = weights
)
summary(model_wls)
##
## Call:
## lm(formula = Life_expectancy_bc ~ Status + Adult_Mortality_c +
## Adult_Mortality_sq + BMI + Schooling + Infant_deaths + Alcohol +
## GDP + Income + Thinness_10_19_years, data = life, weights = weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -3.7940 -0.8385 -0.0562 0.7860 4.4386
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.501e+03 3.571e+01 42.038 < 2e-16 ***
## StatusDeveloping -1.287e+02 1.660e+01 -7.754 1.35e-14 ***
## Adult_Mortality_c -1.696e+01 5.834e-01 -29.065 < 2e-16 ***
## Adult_Mortality_sq -2.625e-01 1.965e-02 -13.363 < 2e-16 ***
## BMI 1.375e+00 3.205e-01 4.291 1.86e-05 ***
## Schooling 6.651e+01 2.839e+00 23.426 < 2e-16 ***
## Infant_deaths 3.563e-01 4.292e-01 0.830 0.407
## Alcohol -9.195e+00 1.665e+00 -5.524 3.71e-08 ***
## GDP 3.337e-03 3.799e-04 8.784 < 2e-16 ***
## Income 5.052e+02 4.017e+01 12.575 < 2e-16 ***
## Thinness_10_19_years -1.031e+01 1.502e+00 -6.863 8.78e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.279 on 2183 degrees of freedom
## Multiple R-squared: 0.8684, Adjusted R-squared: 0.8678
## F-statistic: 1441 on 10 and 2183 DF, p-value: < 2.2e-16
#Diagnostics
#Linearity
fitted_vals_wls = fitted(model_wls)
residuals_wls = resid(model_wls)
for (var in predictors3) {
p = ggplot(life, aes(x = .data[[var]], y = residuals_wls)) +
geom_point(alpha = 0.5) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red")+
ggtitle(paste("Residuals vs", var)) +
theme_minimal() +
ylab("Residuals") +
xlab(var)
print(p)
}
# Residuals vs Fitted Plot
ggplot(data = NULL, aes(x = fitted_vals_wls, y = residuals_wls)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Fitted Values", y = "Residuals", title = "Residuals vs. Fitted Values") +
theme_minimal()
#Linear
#Homoscedasticity
bptest(model_wls)
##
## studentized Breusch-Pagan test
##
## data: model_wls
## BP = 0.025449, df = 10, p-value = 1
#Non constant variance. Also the residuals vs. fitted has a funnel shape
#Normality
goftest::ad.test(residuals_wls)
##
## Anderson-Darling test of goodness-of-fit
## Null hypothesis: uniform distribution
## Parameters assumed to be fixed
##
## data: residuals_wls
## An = Inf, p-value = 2.735e-07
# Q-Q Plot of residuals
ggplot(data = NULL, aes(sample = residuals_wls)) +
stat_qq() +
stat_qq_line(color = "red") +
labs(title = "Q-Q Plot of Residuals") +
theme_minimal()
#Normality applies
#Check outliers and influential points
#Studentized deleted residuals. Threshold using Bonferroni-adjusted p-values
stu_del_resid = rstudent(model_wls)
n = nrow(life)
alpha = 0.05
t_critical = qt(1-alpha/(2*n), df = n-length(coefficients(model_wls)))
stu_del_resid_df = data.frame(ID = as.numeric(rownames(model_wls$model)), rstudent = stu_del_resid)
out_stud = which(abs(stu_del_resid) > t_critical)
ggplot(stu_del_resid_df, aes(x = ID, y = rstudent)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = c(-t_critical, t_critical), color = "red", linetype = "dashed") +
geom_text(aes(label = ifelse(abs(stu_del_resid) > t_critical, ID, "")),
vjust = -0.5, size = 3, color = "black") +
theme_minimal() +
ggtitle("Studentized Deleted Residuals with t Critical Bounds") +
ylab("Studentized Deleted Residuals") +
xlab("Observation ID")
#High leverage
h = hatvalues(model_wls)
p = length(coef(model_wls))
avgLeverage = 2*p/n
highLeverage = which(h > avgLeverage)
leverage_df = data.frame(
ID = as.numeric(rownames(model_wls$model)),
leverage = h
)
ggplot(leverage_df, aes(x = ID, y = leverage)) +
geom_point(color = "blue") +
geom_hline(yintercept = avgLeverage, color = "red", linetype = "dashed") +
geom_text(aes(label = ifelse(leverage > avgLeverage, ID, "")),
vjust = -0.5, size = 3, color = "black") +
theme_minimal() +
labs(
title = "Leverage Values with Threshold",
x = "Observation ID",
y = "Leverage"
)
#Influential points for prediction
dffits_val = dffits(model_wls)
cd_val = cooks.distance(model_wls)
dffits_threshold = 2*sqrt(p/n)
cooks_threshold = qf(0.5, df1 = p, df2 = n-p)
out_dffits = which(abs(dffits_val) > dffits_threshold)
dffits_df = data.frame(
ID = as.numeric(rownames(model_wls$model)),
DFFITS = dffits_val
)
ggplot(dffits_df, aes(x = ID, y = DFFITS)) +
geom_point(color = "blue") +
geom_hline(yintercept = c(-dffits_threshold, dffits_threshold), linetype = "dashed", color = "red") +
geom_text(aes(label = ifelse(abs(DFFITS) > dffits_threshold, ID, "")),
vjust = -0.5, size = 3) +
theme_minimal() +
labs(
title = "DFFITS with Threshold",
x = "Observation ID",
y = "DFFITS"
)
out_cd = which(cd_val > cooks_threshold)
cooks_df = data.frame(
CooksD = cd_val,
ID = as.numeric(rownames(model_wls$model))
)
ggplot(cooks_df, aes(x = ID, y = CooksD)) +
geom_point(color = "blue") +
geom_hline(yintercept = cooks_threshold, linetype = "dashed", color = "red") +
geom_text(aes(label = ifelse(CooksD > cooks_threshold, ID, "")),
vjust = -0.5, size = 3) +
theme_minimal() +
labs(
title = "Cook's Distance with Threshold",
x = "Observation ID",
y = "Cook's Distance"
)
#Removing influential points and outliers
outliers = unique(c(out_dffits, out_cd, out_stud, highLeverage))
life_clean = life[-outliers, ]
model_3c = lm(
data = life_clean,
Life_expectancy_bc ~
Status +
Adult_Mortality_c + Adult_Mortality_sq +
BMI +
Schooling +
Infant_deaths +
Alcohol +
GDP +
Income +
Thinness_10_19_years
)
fitted_vals_3c = fitted(model_3c)
residuals_3c = resid(model_3c)
weights_c = 1/fitted(lm(abs(residuals_3c) ~ fitted_vals_3c))^2
model_wls_c = lm(
data = life_clean,
Life_expectancy_bc ~
Status +
Adult_Mortality_c + Adult_Mortality_sq +
BMI +
Schooling +
Infant_deaths +
Alcohol +
GDP +
Income +
Thinness_10_19_years,
weights = weights_c
)
summary(model_wls_c)
##
## Call:
## lm(formula = Life_expectancy_bc ~ Status + Adult_Mortality_c +
## Adult_Mortality_sq + BMI + Schooling + Infant_deaths + Alcohol +
## GDP + Income + Thinness_10_19_years, data = life_clean, weights = weights_c)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -4.2020 -0.8713 -0.0750 0.7997 5.0461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.456e+03 3.560e+01 40.909 < 2e-16 ***
## StatusDeveloping -9.641e+01 1.352e+01 -7.130 1.41e-12 ***
## Adult_Mortality_c -1.804e+01 5.496e-01 -32.828 < 2e-16 ***
## Adult_Mortality_sq -3.785e-01 2.482e-02 -15.246 < 2e-16 ***
## BMI 5.598e-01 2.792e-01 2.005 0.045100 *
## Schooling 2.061e+01 3.328e+00 6.193 7.18e-10 ***
## Infant_deaths 2.768e-01 9.027e-01 0.307 0.759160
## Alcohol -5.381e+00 1.420e+00 -3.790 0.000155 ***
## GDP 1.906e-03 3.744e-04 5.090 3.92e-07 ***
## Income 1.423e+03 7.230e+01 19.676 < 2e-16 ***
## Thinness_10_19_years -9.277e+00 1.334e+00 -6.956 4.77e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.26 on 1921 degrees of freedom
## Multiple R-squared: 0.9017, Adjusted R-squared: 0.9012
## F-statistic: 1763 on 10 and 1921 DF, p-value: < 2.2e-16
fitted_vals_wls_c = fitted(model_wls_c)
residuals_wls_c = resid(model_wls_c)
# Residuals vs Fitted Plot
ggplot(data = NULL, aes(x = fitted_vals_wls_c, y = residuals_wls_c)) +
geom_point(alpha = 0.6, color = "black") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(x = "Fitted Values", y = "Residuals", title = "Residuals vs. Fitted Values") +
theme_minimal()
#Linear
#Homoscedasticity
bptest(model_wls_c)
##
## studentized Breusch-Pagan test
##
## data: model_wls_c
## BP = 0.031076, df = 10, p-value = 1
#Non constant variance. Also the residuals vs. fitted has a funnel shape
#Normality
goftest::ad.test(residuals_wls_c)
##
## Anderson-Darling test of goodness-of-fit
## Null hypothesis: uniform distribution
## Parameters assumed to be fixed
##
## data: residuals_wls_c
## An = Inf, p-value = 3.106e-07
# Q-Q Plot of residuals
ggplot(data = NULL, aes(sample = residuals_wls_c)) +
stat_qq() +
stat_qq_line(color = "red") +
labs(title = "Q-Q Plot of Residuals") +
theme_minimal()
#Normality applies
#The qq plot improves a lot so we decided to remove all the influential points and outliers